library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
rats dataset using the summary measure
approachIn this problem, we’re using the summary measure approach on the
rats dataset.
The rats data is from a nutrition study presented in the
textbook Analysis of Repeated Measures (Crowdeer & Hand,
1990) on three groups of rats, each group given a different diet. The
rats were weighed weekly over a 9-week period to study the effects of
the different diets.
rats <- read_csv("data/rats.csv") %>% mutate(ID = factor(ID)) %>% mutate(Group = factor(Group))
## Rows: 176 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): ID, Group, Bodyweight, Time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Let’s inspect:
Below, there are three tabs in the notebook, you can switch between them to look at different aspects of the data.
We start by looking at the distribution of initial and final bodyweights to see if we can detect a trend.
initial_time <- min(rats$Time)
final_time <- max(rats$Time)
initial_rats <- rats %>% filter(Time == initial_time)
final_rats <- rats %>% filter(Time == final_time)
min_range <- min(rats$Bodyweight)
max_range <- max(rats$Bodyweight)
ggplot() + geom_density(data= initial_rats, bw = "SJ", aes(x=Bodyweight, fill = "Baseline bodyweight", alpha=0.4)) +
geom_density(data= final_rats, bw = "SJ", aes(x=Bodyweight, fill = "Final bodyweight", alpha=0.4)) +
scale_fill_manual(values=c("Baseline bodyweight" = "#cccccc","Final bodyweight" = "#ff9955")) +
labs(fill= "Weight distribution") +
scale_x_continuous(limits = c(min_range, max_range)) +
scale_alpha(guide = "none")
fig. 1.1, Baseline and final rat weight distributions
There is a clear shift towards a heavier distribution of bodyweights over the course of the experiment — the rats are growing. It looks like there’s three modes in both distribution, but in the final bodyweight distribution, the middle mode has gotten close to the heaviest mode.
Let’s look at the initial bodyweights of the three groups.
initial_rats %>%
ggplot(aes(x=Bodyweight, group=Group, fill=Group, alpha = 0.5)) +
geom_density(bw = "SJ")+
scale_alpha(guide = "none")
fig. 1.2, Baseline weight distributions per group
Hmm, the groups seem to have different bodyweight distributions, the modes do not overlap, and I have a feeling that the difference between groups is much greater than the difference between the growth from initial to final measurements. This is even clearer in the next tab to the right.
ggplot() + geom_density(data= initial_rats, bw = "SJ", aes(x=Bodyweight, group=Group, fill = paste("Group",Group,"baseline"), alpha=0.4)) +
geom_density(data= final_rats, bw = "SJ", aes(x=Bodyweight, group=Group, fill = paste("Group",Group,"final"), alpha=0.4)) +
scale_fill_manual(values=c("Group 1 baseline" = "#ff9999",
"Group 2 baseline" = "#99ff99",
"Group 3 baseline" = "#9999ff",
"Group 1 final" = "#dd2222",
"Group 2 final" = "#22dd22",
"Group 3 final" = "#2222dd")) +
labs(fill= "Weight distribution") +
scale_x_continuous(limits = c(min_range, max_range)) +
scale_alpha(guide = "none")
fig. 1.3, Baseline and final weight distributions per group
The shift of the distributions are visible, but they are still smaller than the initial differences between the weight distributions. In my mind this would make the groups a little difficult to compare, since we can’t be sure that the effect of the baseline bodyweight doesn’t behave differently for each group. The green high peak is a single outlier, as we will see when looking at the line plots.
rat_trend <- rats %>% group_by(Time) %>%
summarise(mean = mean(Bodyweight), se = sd(Bodyweight)/sqrt(n())) %>%
ungroup()
rat_trend %>% ggplot(aes(x=Time, y = mean)) +
geom_line(aes(color="mean +- se")) +
geom_errorbar(aes(ymin=mean-se, ymax=mean+se, color = "mean +- se")) +
labs(color= "Group") +
ylab("Bodyweight") +
geom_line(data=rats, aes(x=Time, y=Bodyweight, color = Group, group=ID))
fig. 1.4, Mean growth and plot of each rats bodyweight increase
We see an issue with taking the mean bodyweight of all rats: none of the rats are at the mean, they straddle the mean on both sides. We also see more clearly in this plot that one of the groups (group 1) has double the number of rats that the others do (8 vs. 4). The outlier in the green group is also much more apparent here, we might have to remove it.
The lines are difficult to compare because the groups’ baselines are
so different. If we could get all of the lines to start at the same
location, we may be able to compare them better, so let’s create a new
column for each observation subtracting the baseline bodyweight from the
measured bodyweight, which we can call Weight_gain.
And then we can look at graphical displays of that data.
rats <- rats %>%
mutate(Weight_gain = Bodyweight - filter(initial_rats, ID == ID)$Bodyweight) %>%
mutate(Baseline_weight = initial_rats[as.integer(ID),]$Bodyweight)
checksum <- rats$Baseline_weight - rats$Bodyweight + rats$Weight_gain
all(checksum == 0)
## [1] TRUE
The initial distribution of weight gain is constant (all zeros) so we will only plot the overall weight gain distribution after 9 weeks.
final_rats <- rats %>% filter(Time == final_time)
ggplot() + geom_density(data=final_rats, bw = "SJ", aes(x=Weight_gain, fill = "0")) +
scale_fill_manual(values=c("0"="#ff9955"), guide = "none") +
xlab("Weight gain after 9 weeks")
fig. 2.1, Weight gain distribution after 9 weeks
There seem to be two modes here, which may indicate a difference between groups.
Let’s look at the final bodyweights of the three groups.
final_rats %>%
ggplot(aes(x=Weight_gain, group=Group, fill=Group, alpha = 0.5)) +
geom_density(bw = "SJ")+
scale_alpha(guide = "none")+
xlab("Weight gain after 9 weeks")
fig. 2.2, Weight gain distribution per group after 9 weeks
Okay, there seem to be some difference between the red (1) and green (3) groups, but these were also the groups with the largest differences in the baseline.
rat_trend <- rats %>% group_by(Time) %>%
summarise(mean = mean(Bodyweight), mean_gain = mean(Weight_gain), se = sd(Bodyweight)/sqrt(n()),se_gain = sd(Weight_gain)/sqrt(n())) %>%
ungroup()
rat_trend %>% ggplot(aes(x=Time, y = mean_gain,ymin=mean_gain-se_gain, ymax=mean_gain+se_gain)) +
geom_line(aes(color="mean +- se")) +
geom_errorbar(aes(color = "mean +- se")) +
geom_ribbon(aes(fill="0",alpha=0.1)) +
labs(color= "Group") +
ylab("Weight gain") +
scale_fill_manual(guide="none", values =(c("0"="#cccccc"))) +
scale_alpha(guide="none") +
geom_line(data=rats, aes(x=Time, y=Weight_gain,ymin =0, ymax=0, color = Group, group=ID))
## Warning in geom_line(data = rats, aes(x = Time, y = Weight_gain, ymin = 0, :
## Ignoring unknown aesthetics: ymin and ymax
fig. 2.3, Mean growth and plot of each rats bodyweight increase
Looking at the weight gain of each rat, we do start to see some
Let’s standardize the dataset and plot that last plot
(Growth Trend) again, for both the body weight and the
weight gain
rats_std <- rats %>%
group_by(Time) %>%
mutate(std_weight = scale(Bodyweight)) %>%
mutate(std_weight_gain = if_else(Weight_gain != 0, scale(Weight_gain), 0)) %>%
ungroup()
std_rat_trend <- rats_std %>% group_by(Time) %>%
summarise(mean = mean(std_weight), mean_gain = mean(std_weight_gain), se = sd(std_weight)/sqrt(n()),se_gain = sd(std_weight_gain)/sqrt(n())) %>%
ungroup()
weight_plot <- std_rat_trend %>% ggplot(title = "", aes(x=Time, y = mean,ymin=mean-se, ymax=mean+se)) +
geom_line(aes(color="mean +- se", linetype="2")) +
geom_errorbar(aes(ymin=mean-se, ymax=mean+se, color = "mean +- se"), width=1.5, size = 0.3) +
geom_ribbon(aes(fill="0",alpha=0.1)) +
ylab("std. weight") +
geom_line(data=rats_std, aes(x=Time, y=std_weight, ymin=0, ymax=0, color = Group, group=ID, linetype="1")) +
scale_fill_manual(guide="none", values =(c("0"="#cccccc"))) +
theme(legend.position="none")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in geom_line(data = rats_std, aes(x = Time, y = std_weight, ymin = 0, :
## Ignoring unknown aesthetics: ymin and ymax
gain_plot <- std_rat_trend %>% ggplot(aes(x=Time, y = mean_gain, ymin=mean_gain-se_gain, ymax=mean_gain+se_gain)) +
geom_line(aes(color="mean +- se", linetype="2")) +
geom_errorbar(aes( color = "mean +- se"), width=1.5, size = 0.3) +
geom_ribbon(aes(fill="0",alpha=0.1)) +
labs(color= "Group") +
ylab("std. weight gain") +
geom_line(data=rats_std, aes(x=Time, y=std_weight_gain, ymin=0, ymax=0, color = Group, group=ID, linetype="1")) +
scale_linetype(guide="none") +
scale_fill_manual(guide="none", values =(c("0"="#cccccc"))) +
scale_alpha(guide="none") +
theme(legend.position = c(0.92,0.105))
## Warning in geom_line(data = rats_std, aes(x = Time, y = std_weight_gain, :
## Ignoring unknown aesthetics: ymin and ymax
gridExtra::grid.arrange(weight_plot,gain_plot, ncol=2)
fig. 3.1, Standardized weight and weight gain per rat, and all-groups mean
The standardized bodyweight plot is difficult to interpret because of the difference in baselines, but we can identify a potential outlier in the green group.
The standardized weight gain plot does show some patterns, we see some spread among the groups, which we can dive deeper into.
Now let’s do that same plot but instead of individuals, let’s plot the mean and standard error per group
std_rat_group_trends <- rats_std %>% group_by(Time, Group) %>%
summarise(mean = mean(std_weight), mean_gain = mean(std_weight_gain), se = sd(std_weight)/sqrt(n()),se_gain = sd(std_weight_gain)/sqrt(n())) %>%
ungroup()
## `summarise()` has grouped output by 'Time'. You can override using the
## `.groups` argument.
sum_weight_plot <- std_rat_group_trends %>% ggplot(aes(x = Time, y = mean,ymin=mean-se, ymax=mean+se, color = Group)) +
ggtitle("Bodyweight per group") +
geom_line() +
geom_point(size=3) +
geom_linerange() +
geom_ribbon(aes(fill=Group,alpha = 0.1)) +
scale_y_continuous(name = "mean(std. weight) +/- se(std. weight)") +
theme(legend.position="none")
sum_gain_plot <- std_rat_group_trends %>% ggplot(aes(x = Time,ymin=mean_gain-se_gain, ymax=mean_gain+se_gain, y = mean_gain, color = Group)) +
ggtitle("Weight gain per group") +
geom_line() +
geom_point(size=3) +
geom_linerange() +
geom_ribbon(aes(fill=Group,alpha = 0.1)) +
scale_y_continuous(name = "mean(std. weight gain) +/- se(std. weight gain)") +
scale_alpha(guide="none") +
theme(legend.position = c(0.95,0.08))
gridExtra::grid.arrange(sum_weight_plot,sum_gain_plot, ncol=2)
fig. 3.2, Bodyweight and weight gain over time per group
Looking at the weight gain plot specifically, we do see the three groups clearly separating, but we still don’t know how much of this effect would be because of the baseline weight of the rats or the treatment, since both of these variables differed between the groups.
weight_boxplot <- final_rats %>% ggplot(aes(y=Bodyweight)) +
geom_boxplot() +
facet_wrap(~Group)
gain_boxplot <- final_rats %>% ggplot(aes(y=Weight_gain)) +
geom_boxplot() +
facet_wrap(~Group)
gridExtra::grid.arrange(weight_boxplot, gain_boxplot, ncol=2)
fig. 4.1, boxplots of the three groups
There is a rat in group 2 that seems to be increasing the variance in the group, which would make t-tests more difficult to run. Let’s remove it
outlier_ids <- (final_rats %>% filter(Bodyweight > 600))$ID
rats <- rats %>% filter(!(ID %in% outlier_ids))
initial_rats <- rats %>% filter(Time == initial_time)
final_rats <- rats %>% filter(Time == final_time)
weight_boxplot2 <- final_rats %>% ggplot(aes(y=Bodyweight)) +
geom_boxplot() +
facet_wrap(~Group)
gain_boxplot2 <- final_rats %>% ggplot(aes(y=Weight_gain)) +
geom_boxplot() +
facet_wrap(~Group)
gridExtra::grid.arrange(weight_boxplot2, gain_boxplot2, ncol=2)
fig. 4.2, boxplots of the three groups, outliers removed
There is still seemingly one outlier, but the difference in variances between the groups now seems better.
One issue is that the variance of group 1 is much smaller than groups 2 and 3 — likely because there are twice as many rats in that group.
Visually we have identified a between-groups effect, but there are two possible causes of the effect: the baseline weight and the treatment (diet).
Based on table 8.2 from MABS4IODS, I will use as summary measure the final value of the weight gain, because there is a growth trend in the data. Another option for summary measures would have been the regression coefficients of the bodyweight of each individual rat.
As the baselines are different for each group, using unpaired t-tests we can only compare the weight gain between two groups. The tabs contain the t-tests for each comparison of two groups.
final_rats_12 <- final_rats %>% filter(Group != 3)
final_rats_13 <- final_rats %>% filter(Group != 2)
final_rats_23 <- final_rats %>% filter(Group != 1)
Let’s also check if the variances of the groups are similar:
final_rats %>% group_by(Group) %>%
summarise(var = var(Weight_gain))
## # A tibble: 3 × 2
## Group var
## <fct> <dbl>
## 1 1 86.1
## 2 2 1051
## 3 3 326.
they are not. I will therefore use Welch’s t-test (var.equal = FALSE) when doing the t-tests.
t.test(Weight_gain ~ Group, data = final_rats_12, var.equal = F)
##
## Welch Two Sample t-test
##
## data: Weight_gain by Group
## t = -2.0458, df = 2.1242, p-value = 0.1699
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
## -116.22098 38.47098
## sample estimates:
## mean in group 1 mean in group 2
## 23.125 62.000
t.test(Weight_gain ~ Group, data = final_rats_13, var.equal = F)
##
## Welch Two Sample t-test
##
## data: Weight_gain by Group
## t = -1.9138, df = 3.8172, p-value = 0.1316
## alternative hypothesis: true difference in means between group 1 and group 3 is not equal to 0
## 95 percent confidence interval:
## -45.543111 8.793111
## sample estimates:
## mean in group 1 mean in group 3
## 23.125 41.500
t.test(Weight_gain ~ Group, data = final_rats_23, var.equal = F)
##
## Welch Two Sample t-test
##
## data: Weight_gain by Group
## t = 0.98659, df = 2.932, p-value = 0.3981
## alternative hypothesis: true difference in means between group 2 and group 3 is not equal to 0
## 95 percent confidence interval:
## -46.50239 87.50239
## sample estimates:
## mean in group 2 mean in group 3
## 62.0 41.5
There seems to be some difference between groups 1 and 2 and groups 1 and 3, but the t-test is not significant at level 0.05 (the confidence intervals straddle 0). Based on this, we could conclude that there may be something causing the mice in group 1 to gain less weight than the other groups (but we still can’t say if it’s their diet).
Interestingly, the test statistic seems to be the same for
Group regardless of if we use the body weight or the weight
gain as the response variable, so let’s use Weight gain.
For each of the four combinations {1,2,3}, {1,2}, {1,3}, {2,3}, we fit a linear model and then we perform an analysis of variance (ANOVA) on the fit.
Dependent variable:
Independent variables:
final_fit_gain <- lm(Weight_gain ~ Baseline_weight + Group, data = final_rats)
anova(final_fit_gain)
## Analysis of Variance Table
##
## Response: Weight_gain
## Df Sum Sq Mean Sq F value Pr(>F)
## Baseline_weight 1 1308.3 1308.32 8.1039 0.015889 *
## Group 2 4072.2 2036.11 12.6120 0.001423 **
## Residuals 11 1775.9 161.44
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
final_fit_gain_12 <- lm(Weight_gain ~ Baseline_weight + Group, data = final_rats_12)
anova(final_fit_gain_12)
## Analysis of Variance Table
##
## Response: Weight_gain
## Df Sum Sq Mean Sq F value Pr(>F)
## Baseline_weight 1 2369.3 2369.30 15.279 0.004489 **
## Group 1 2392.3 2392.32 15.427 0.004370 **
## Residuals 8 1240.6 155.07
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
final_fit_gain_13 <- lm(Weight_gain ~ Baseline_weight + Group, data = final_rats_13)
anova(final_fit_gain_13)
## Analysis of Variance Table
##
## Response: Weight_gain
## Df Sum Sq Mean Sq F value Pr(>F)
## Baseline_weight 1 662.01 662.01 6.9201 0.02733 *
## Group 1 957.27 957.27 10.0066 0.01149 *
## Residuals 9 860.98 95.66
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
final_fit_gain_23 <- lm(Weight_gain ~ Baseline_weight + Group, data = final_rats_23)
anova(final_fit_gain_23)
## Analysis of Variance Table
##
## Response: Weight_gain
## Df Sum Sq Mean Sq F value Pr(>F)
## Baseline_weight 1 1870.8 1870.80 6.2693 0.06649 .
## Group 1 735.0 735.00 2.4631 0.19163
## Residuals 4 1193.6 298.41
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Now we can say that it seems like there is a significant effect due to diet, at p=0.001423, and that the effect is between group 1 and the other groups. (1 vs 2: p = 0.004489, 1 vs 3:p = 0.01149).
Based on the summary measure analysis, we can conclude that there is a statistically significant change in the mean weight gain of a rat when given the same diet that the first group of rats were given.
For this problem we’re creating a linear mixed effects model for the BPRS dataset.
BPRS is the brief psychiatric rating scale, used as indicator of schizophrenia. The dataset consists of BPRS evaluations from 40 subjects, with one observations per subject taken once a week for 9 weeks. Half the subjects were given one treatment, the other half another treatment.
bprs <- read_csv("data/bprs.csv") %>% mutate(subject = factor(subject)) %>% mutate(treatment = factor(treatment))
## Rows: 360 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): treatment, subject, bprs, week
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.